基于SVM的乳腺癌数据集分类¶

使用支持向量机(SVM)对乳腺癌数据集进行分类,分别使用Sigmoid核、线性核和高斯核进行分类,使用准确率、精确率、召回率、F1分数、ROC曲线等指标对模型进行评估,对模型的决策边界进行可视化,对比三种核函数的分类效果。

目录¶

  1. 可视化工具函数
    1. ROC曲线绘制
    2. 混淆矩阵
    3. 评估指标与分类报告
    4. 决策边界绘制
  2. 数据集
    1. 特征分布
  3. Sigmoid核函数
  4. 线性核函数
  5. 高斯核函数
In [ ]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Configure matplotlib to use a font with CJK glyph coverage so the
# Chinese plot titles used throughout this notebook render correctly.
plt.rcParams['font.family'] = 'Microsoft YaHei'

可视化函数¶

ROC曲线¶

In [ ]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
def draw_roc(y_true, y_score, title):
    """Plot the ROC curve (with AUC in the legend) for scores vs. ground truth."""
    false_pos, true_pos, _ = roc_curve(y_true, y_score)
    area = roc_auc_score(y_true, y_score)

    plt.figure()
    plt.plot(false_pos, true_pos, label='ROC curve (area = %0.2f)' % area)
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

混淆矩阵¶

In [ ]:
# Render the confusion matrix of predictions vs. ground truth as a heatmap.
def draw_confusion_matrix(y_true, y_pred, title):
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix

    matrix = confusion_matrix(y_true, y_pred)

    plt.figure()
    # annot/fmt='d' prints the raw integer count in each cell.
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

评估指标与分类报告¶

In [ ]:
# Render the per-class precision/recall/F1 report as a heatmap.
def draw_classification_report(y_true, y_pred, title):
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import classification_report

    # output_dict=True returns a nested dict so it can be tabulated with pandas.
    report = classification_report(y_true, y_pred, target_names=['Benign', 'Malignant'], output_dict=True)

    # Drop the last row (support) before plotting; transpose so classes are rows.
    table = pd.DataFrame(report).iloc[:-1, :].T
    plt.figure()
    sns.heatmap(table, annot=True)
    plt.title(title)
    plt.show()

# Show accuracy / weighted precision / recall / F1 as a horizontal bar chart.
def draw_metrics(y_true, y_pred, title):
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1score, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    scores = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1-score': f1score}

    fig, ax = plt.subplots(figsize=(6, 2))
    names = list(scores.keys())
    vals = list(scores.values())
    # Annotate each bar with its numeric value, right-aligned at the bar tip.
    for pos, val in enumerate(vals):
        ax.text(val, pos, f'{val:.4f}', ha='right', va='center')
    ax.barh(names, vals, color='#02ABEC')
    plt.xlim(0, 1)
    plt.title(title)
    plt.show()

决策边界¶

In [ ]:
def draw_decision_boundary(X, y, model, title, x_columns, display_features = ('texture_mean', 'concavity_mean')):
    """Project the classifier's decision regions onto two chosen features.

    All remaining features are clamped to a filler value so the boundary
    can be drawn in 2-D. Default for display_features changed from a mutable
    list to an equivalent tuple (mutable default arguments are shared across
    calls and a latent hazard).
    """
    import matplotlib.pyplot as plt
    from mlxtend.plotting import plot_decision_regions

    # Indices of the two displayed features, and of everything else.
    shown = [x_columns.index(name) for name in display_features]
    hidden = [i for i in range(X.shape[1]) if i not in shown]

    _, axis = plt.subplots(1, 1, figsize=(8, 5))
    plot_decision_regions(
        X, y, clf=model, legend=2,
        feature_index=shown,
        # Non-displayed features fixed at 0 with a tolerance range of 3
        # (data is standardized, so 0 is the mean).
        filler_feature_values={i: 0 for i in hidden},
        filler_feature_ranges={i: 3 for i in hidden},
        ax=axis
    )

    plt.xlabel(display_features[0], size=10)
    plt.ylabel(display_features[1], size=10)
    plt.title(title, size=12)
    plt.show()

学习曲线¶

In [ ]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training vs. cross-validation score as the training set grows."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded bands show +/- one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_mean - train_std,
                     train_mean + train_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_mean - test_std,
                     test_mean + test_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()

数据集¶

数据集使用 Kaggle - PRIYANKA/Breast Cancer Wisconsin ,

数据列: id, diagnosis, radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave points_worst, symmetry_worst, fractal_dimension_worst

  • diagnosis:诊断结果,M代表恶性(malignant),B代表良性(benign)。
  • radius:细胞核的半径。
  • texture:灰度值,反映了细胞核的纹理。
  • perimeter:细胞核的周长。
  • area:细胞核的面积。
  • smoothness:半径长度的变化,反映了细胞核的平滑度。
  • compactness:周长平方除以面积减去1,反映了细胞核的紧密度。
  • concavity:轮廓的凹部的严重程度。
  • concave points:轮廓凹部的数量。
  • symmetry:细胞核对称性。
  • fractal_dimension:细胞核的分形维数,反映了细胞核的复杂度。
In [ ]:
# Load the Breast Cancer Wisconsin CSV (see dataset description above).
dataset = pd.read_csv("./data/breast cancer.csv")

# Hand-picked subset of the mean/se/worst feature columns used as model input.
x_columns = ["radius_mean", "texture_mean", "smoothness_mean", "area_mean", "compactness_mean", "concavity_mean",
             "concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", 
             "smoothness_se", "smoothness_worst",  "radius_worst", "concavity_se"]

# Feature matrix and diagnosis labels ('M' malignant / 'B' benign).
X = dataset.loc[:, x_columns].values
y = dataset.loc[:, "diagnosis"].values

数据集特征分布¶

In [ ]:
# Pairwise scatter matrix of the selected features, colored by diagnosis (M/B).
sns.pairplot(dataset.loc[:, [*x_columns, "diagnosis"]],hue = 'diagnosis', palette = {'M': '#FFC000', 'B': '#02ABEC'})
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7c347aa65750>
No description has been provided for this image
In [ ]:
# All 30 numeric feature columns, for per-feature distribution plots.
all_x_columns = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave points_se", "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst", "concavity_worst", "concave points_worst", "symmetry_worst", "fractal_dimension_worst"]
all_X = dataset.loc[:, all_x_columns].values
df = pd.DataFrame(all_X, columns=all_x_columns)
df['diagnosis'] = y

plt.figure(figsize=(18, 14))
plt.suptitle('特征分布 - 卢继鹏', position=(0.5, 0.90))

# One violin plot per feature, split by diagnosis, on a 4x8 grid
# (subplot slots are 1-based).
for slot, column in enumerate(all_x_columns, start=1):
    plt.subplot(4, 8, slot)
    sns.violinplot(y='diagnosis', x=column, data=df, color='#02ABEC')

plt.show()
No description has been provided for this image

将数据集分割为训练集和测试集,其中80%的数据用于训练,20%的数据用于测试。

使用StandardScaler对数据进行标准化,使数据的平均值为0,标准差为1。

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Standardize features: fit the scaler on the training set only, then
# apply the same transform to the test set (avoids data leakage).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Encode the class labels: benign 'B' -> 0, malignant 'M' -> 1.
y_train = np.where(y_train == 'M', 1, 0)
y_test = np.where(y_test == 'M', 1, 0)
In [ ]:
import matplotlib.pyplot as plt

# Compare the raw vs. standardized distribution of one feature side by side.
feature_idx = 0

plt.figure(figsize=(5, 2))
plt.suptitle('数据归一化 - 卢继鹏', position=(0.5, 1.1))

panels = [
    (1, X[:, feature_idx], 'Original Data'),
    (2, X_train[:, feature_idx], 'Standardized Data'),
]
for slot, values, subtitle in panels:
    plt.subplot(1, 2, slot)
    plt.hist(values, bins=30, color='#02ABEC')
    plt.title(subtitle)
plt.show()
No description has been provided for this image

Sigmoid核SVM¶

构建模型¶

使用scikit-learn库中的支持向量机对数据进行训练的过程。

创建一个SVC类的实例,使用Sigmoid核函数,并设置随机状态种子为0,以确保结果的可重复性。

使用X_train和y_train训练SVM分类器。

In [ ]:
from sklearn.svm import SVC
# Sigmoid-kernel SVM; probability=True enables predict_proba (useful for ROC scores).
classifier_sigmoid = SVC(kernel='sigmoid', random_state=0, probability=True)
classifier_sigmoid.fit(X_train, y_train)
Out[ ]:
SVC(kernel='sigmoid', probability=True, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='sigmoid', probability=True, random_state=0)

模型决策边界¶

选择两个特征,使用plot_decision_regions绘制分类器的决策边界在这两个特征的投影。可以看到分类器在二维特征空间中的分类效果。

In [ ]:
draw_decision_boundary(X_train, y_train, classifier_sigmoid, 'Sigmoid核 SVM 决策边界 - 卢继鹏', x_columns)
No description has been provided for this image

学习曲线¶

使用learning_curve绘制学习曲线,观察训练集和测试集的准确率随着训练样本数量的变化。

训练得分:表明了模型在训练集和测试集上的准确率,可以看到随着训练样本数量的增加,模型的准确率逐渐提高。但继续增加训练样本数量可能会减少训练得分,因为模型在更大的数据集上更难拟合。

交叉验证得分:表明了模型在交叉验证集上的准确率,可以看到随着训练样本数量的增加,模型的准确率逐渐提高,但在一定数量后准确率趋于稳定。

通过观察训练得分和验证得分的变化,我们可以判断模型是否过拟合或欠拟合。如果训练得分和验证得分之间的差距较大,可能是过拟合;如果两者都较低,可能是欠拟合。

In [ ]:
plot_learning_curve(classifier_sigmoid, "Sigmoid 核 SVM 学习曲线 - 卢继鹏", X_train, y_train, cv=5)
No description has been provided for this image

模型评估¶

使用X_test测试集对模型进行评估。

对比预测结果y_pred和实际结果y_test,计算准确率。

In [ ]:
# Hard 0/1 predictions on the held-out test set.
y_pred_sigmoid = classifier_sigmoid.predict(X_test)

使用sklearn.metrics库中的confusion_matrix函数,得到混淆矩阵,可以看到模型在测试集上的分类效果。

In [ ]:
draw_confusion_matrix(y_test, y_pred_sigmoid, 'Sigmoid核 SVM 混淆矩阵 - 卢继鹏')
No description has been provided for this image

使用sklearn.metrics库中的accuracy_score和classification_report等函数,得到模型的准确率、精确率、召回率、F1值等评价指标。

In [ ]:
draw_metrics(y_test, y_pred_sigmoid, 'Sigmoid核 SVM 模型评估指标 - 卢继鹏')
draw_classification_report(y_test, y_pred_sigmoid, 'Sigmoid核 SVM 分类报告 - 卢继鹏')
Accuracy: 0.9123, Precision: 0.9122, Recall: 0.9123, F1-Score: 0.9120
No description has been provided for this image
{'Benign': {'precision': 0.9130434782608695, 'recall': 0.9402985074626866, 'f1-score': 0.9264705882352942, 'support': 67.0}, 'Malignant': {'precision': 0.9111111111111111, 'recall': 0.8723404255319149, 'f1-score': 0.8913043478260869, 'support': 47.0}, 'accuracy': 0.9122807017543859, 'macro avg': {'precision': 0.9120772946859903, 'recall': 0.9063194664973008, 'f1-score': 0.9088874680306905, 'support': 114.0}, 'weighted avg': {'precision': 0.9122468005763199, 'recall': 0.9122807017543859, 'f1-score': 0.9119722259613227, 'support': 114.0}}
No description has been provided for this image

使用sklearn.metrics库中的roc_curve函数,计算ROC曲线,使用matplotlib库绘制ROC曲线,评估模型的分类效果。

In [ ]:
# Use continuous decision scores rather than hard 0/1 predictions: with
# binary predictions roc_curve has a single threshold and the curve degenerates.
draw_roc(y_test, classifier_sigmoid.decision_function(X_test), 'Sigmoid核 SVM ROC曲线 - 卢继鹏')
No description has been provided for this image

线性核SVM¶

构建模型¶

In [ ]:
# Linear-kernel SVM for comparison with the sigmoid kernel.
classifier_linear =SVC(kernel='linear', random_state=0)
classifier_linear.fit(X_train, y_train)
Out[ ]:
SVC(kernel='linear', random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='linear', random_state=0)
In [ ]:
draw_decision_boundary(X_train, y_train, classifier_linear, '线性核 SVM 决策边界 - 卢继鹏', x_columns)
No description has been provided for this image
In [ ]:
plot_learning_curve(classifier_linear, "线性核 SVM 学习曲线 - 卢继鹏", X_train, y_train, cv=5)
No description has been provided for this image

模型评估¶

In [ ]:
# Hard 0/1 predictions on the held-out test set.
y_pred_linear = classifier_linear.predict(X_test)
In [ ]:
draw_confusion_matrix(y_test, y_pred_linear, '线性核SVM 混淆矩阵 - 卢继鹏')
No description has been provided for this image
In [ ]:
draw_metrics(y_test, y_pred_linear, '线性核SVM 模型评估指标 - 卢继鹏')
draw_classification_report(y_test, y_pred_linear, '线性核SVM 分类报告 - 卢继鹏')
Accuracy: 0.9737, Precision: 0.9738, Recall: 0.9737, F1-Score: 0.9736
No description has been provided for this image
{'Benign': {'precision': 0.9705882352941176, 'recall': 0.9850746268656716, 'f1-score': 0.9777777777777777, 'support': 67.0}, 'Malignant': {'precision': 0.9782608695652174, 'recall': 0.9574468085106383, 'f1-score': 0.967741935483871, 'support': 47.0}, 'accuracy': 0.9736842105263158, 'macro avg': {'precision': 0.9744245524296675, 'recall': 0.9712607176881549, 'f1-score': 0.9727598566308244, 'support': 114.0}, 'weighted avg': {'precision': 0.9737515143357114, 'recall': 0.9736842105263158, 'f1-score': 0.9736401936741494, 'support': 114.0}}
No description has been provided for this image
In [ ]:
# Use continuous decision scores rather than hard 0/1 predictions: with
# binary predictions roc_curve has a single threshold and the curve degenerates.
draw_roc(y_test, classifier_linear.decision_function(X_test), '线性核SVM ROC曲线 - 卢继鹏')
No description has been provided for this image

高斯核SVM¶

构建模型¶

In [ ]:
# RBF (Gaussian)-kernel SVM for comparison with the other two kernels.
classifier_rbf =SVC(kernel='rbf', random_state=0)
classifier_rbf.fit(X_train, y_train)
Out[ ]:
SVC(random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(random_state=0)
In [ ]:
draw_decision_boundary(X_train, y_train, classifier_rbf, '高斯核 SVM 决策边界 - 卢继鹏', x_columns)
No description has been provided for this image
In [ ]:
plot_learning_curve(classifier_rbf, "高斯核 SVM 学习曲线 - 卢继鹏", X_train, y_train, cv=5)
No description has been provided for this image

模型评估¶

In [ ]:
# Hard 0/1 predictions on the held-out test set.
y_pred_rbf = classifier_rbf.predict(X_test)
In [ ]:
draw_confusion_matrix(y_test, y_pred_rbf, '高斯核SVM 混淆矩阵 - 卢继鹏')
No description has been provided for this image
In [ ]:
draw_metrics(y_test, y_pred_rbf, '高斯核SVM 模型评估指标 - 卢继鹏')
draw_classification_report(y_test, y_pred_rbf, '高斯核SVM 分类报告 - 卢继鹏')
Accuracy: 0.9649, Precision: 0.9653, Recall: 0.9649, F1-Score: 0.9648
No description has been provided for this image
{'Benign': {'precision': 0.9565217391304348, 'recall': 0.9850746268656716, 'f1-score': 0.9705882352941176, 'support': 67.0}, 'Malignant': {'precision': 0.9777777777777777, 'recall': 0.9361702127659575, 'f1-score': 0.9565217391304348, 'support': 47.0}, 'accuracy': 0.9649122807017544, 'macro avg': {'precision': 0.9671497584541062, 'recall': 0.9606224198158145, 'f1-score': 0.9635549872122762, 'support': 114.0}, 'weighted avg': {'precision': 0.9652851936604796, 'recall': 0.9649122807017544, 'f1-score': 0.9647888903845291, 'support': 114.0}}
No description has been provided for this image
In [ ]:
# Use continuous decision scores rather than hard 0/1 predictions: with
# binary predictions roc_curve has a single threshold and the curve degenerates.
draw_roc(y_test, classifier_rbf.decision_function(X_test), '高斯核SVM ROC曲线 - 卢继鹏')
No description has been provided for this image

三种模型的比较¶

In [ ]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Compare the three kernels on one ROC plot. Use each classifier's
# continuous decision scores rather than hard 0/1 predictions: with binary
# predictions roc_curve only has a single threshold, which understates AUC
# and reduces the curve to a single point.
score_sigmoid = classifier_sigmoid.decision_function(X_test)
score_linear = classifier_linear.decision_function(X_test)
score_rbf = classifier_rbf.decision_function(X_test)

fpr1, tpr1, thresholds1 = roc_curve(y_test, score_sigmoid)
auc1 = roc_auc_score(y_test, score_sigmoid)

fpr2, tpr2, thresholds2 = roc_curve(y_test, score_linear)
auc2 = roc_auc_score(y_test, score_linear)

fpr3, tpr3, thresholds3 = roc_curve(y_test, score_rbf)
auc3 = roc_auc_score(y_test, score_rbf)

# Plot all three ROC curves plus the chance-level diagonal.
plt.figure()
plt.plot(fpr1, tpr1, label='Sigmoid 核SVM ROC curve (area = %0.2f)' % auc1)
plt.plot(fpr2, tpr2, label='线性核SVM ROC curve (area = %0.2f)' % auc2)
plt.plot(fpr3, tpr3, label='高斯核SVM ROC curve (area = %0.2f)' % auc3)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('不同核函数SVM ROC曲线对比 - 卢继鹏')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [ ]:
def plot_combined_learning_curve(classifiers, titles, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Overlay the learning curves of several classifiers on one figure.

    Parameters
    ----------
    classifiers : list of estimators (at most 3, matching the color lists)
    titles : list of str, legend-label prefix for each estimator
    X, y : training data forwarded to sklearn.model_selection.learning_curve
    cv, n_jobs, train_sizes : forwarded to learning_curve
    """
    plt.figure(figsize=(10, 7))
    colors = ['#FF0000', '#FFC000', '#02ABEC' ]
    colors_secondaries = ['#FF4444', '#FFD066', '#40AAEC']

    for i, (clf, title) in enumerate(zip(classifiers, titles)):
        # FIX: the original assigned the result back to `train_sizes`,
        # clobbering the parameter so every iteration after the first
        # received the absolute sample counts returned by the first call
        # instead of the requested fractions. Bind to a local name instead.
        sizes, train_scores, test_scores = learning_curve(
            clf, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)

        plt.plot(sizes, train_scores_mean, 'o-', label=f"{title} - Training score", color=colors[i])
        plt.plot(sizes, test_scores_mean, 'x-', label=f"{title} - Cross-validation score", color=colors_secondaries[i])
        # Shaded band: +/- one std deviation of the training score.
        plt.fill_between(sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color=colors[i])

    plt.title("不同核函数SVM 学习曲线对比 - 卢继鹏")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

# Overlay the learning curves of all three kernel SVMs for comparison.
classifiers = [classifier_sigmoid, classifier_linear, classifier_rbf]
titles = ["Sigmoid核 SVM", "线性核 SVM", "高斯核 SVM"]
plot_combined_learning_curve(classifiers, titles, X_train, y_train, cv=5)
No description has been provided for this image